{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T16:57:51.682859Z",
     "start_time": "2020-05-21T16:57:51.663193Z"
    }
   },
   "outputs": [],
   "source": [
    "%%HTML\n",
    "<style>\n",
    "  /* Use a larger font for table cells, table rows and list items. */\n",
    "  td, tr, li { font-size: 18px; }\n",
    "</style>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T16:57:53.937820Z",
     "start_time": "2020-05-21T16:57:52.634431Z"
    }
   },
   "outputs": [],
   "source": [
    "from IPython.core.display import display, HTML\n",
    "from utils import load_data, print_data_stats, subset_data\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to the dataset\n",
    "- What is Sentiment Analysis?\n",
    "    - \"I like this movie\" --> positive\n",
    "    - \"I hate this movie\" --> negative\n",
    "- [ABSA multilingual SA dataset](http://alt.qcri.org/semeval2016/task5/)\n",
    "- What kinds of preprocessing can you think of from the examples below?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T16:57:54.731139Z",
     "start_time": "2020-05-21T16:57:54.693076Z"
    }
   },
   "outputs": [],
   "source": [
    "# Language codes of the datasets used throughout the notebook.\n",
    "LANGS = [\"ar\", \"en\", \"es\", \"ru\", \"zh\"]\n",
    "\n",
    "# Language code -> lowercase English language name (the form handed to the NLTK helpers below).\n",
    "LANGS_MAPPING = {\n",
    "    \"en\": \"english\",\n",
    "    \"es\": \"spanish\",\n",
    "    \"ru\": \"russian\",\n",
    "    \"ar\": \"arabic\",\n",
    "    \"zh\": \"chinese\",\n",
    "}\n",
    "\n",
    "# Indexed below as data[lang][\"train\" | \"test\"] -> sequence of (sentence, label) pairs.\n",
    "data = load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:51:13.457405Z",
     "start_time": "2020-05-21T07:51:13.429689Z"
    }
   },
   "outputs": [],
   "source": [
    "print_data_stats(data, max_len=40)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- Re-sample the data so that every language has the same number of training examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-18T15:26:42.865935Z",
     "start_time": "2020-05-18T15:26:42.828476Z"
    }
   },
   "outputs": [],
   "source": [
    "data_sampled = subset_data(data)\n",
    "print_data_stats(data_sampled,40)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Activity: build your own dataset (20 for train, 5 for test)\n",
    "- try to use similar words as much as possible\n",
    "- try to make some word overlaps between examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:53:32.276540Z",
     "start_time": "2020-05-21T07:53:32.260881Z"
    }
   },
   "outputs": [],
   "source": [
    "# Code of the language you are adding; the sentence lists are tiny placeholders to replace with your own data.\n",
    "NEW_LANG = \"?\"\n",
    "train_pos_sents = [\"I like this movie\",\"Ihe nkiri a masịrị m\", \"Ninapenda sinema hii\"]\n",
    "train_neg_sents = [\"I hate this movie\"]\n",
    "test_pos_sents = [\"I enjoyed the movie\"]\n",
    "test_neg_sents = [\"Never watch it\"]\n",
    "\n",
    "# Same layout as the loaded datasets: one list of (sentence, label) pairs per split.\n",
    "# The test split must come from the held-out test_* sentences, never from the training ones.\n",
    "data[NEW_LANG] = {\n",
    "    \"train\": [(sent,\"pos\") for sent in train_pos_sents] + [(sent,\"neg\") for sent in train_neg_sents],\n",
    "    \"test\": [(sent,\"pos\") for sent in test_pos_sents] + [(sent,\"neg\") for sent in test_neg_sents],\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load stemmers, word_tokenizers, stopword_filters\n",
    "- **stemming/lemmatization**: reducing inflected (or sometimes derived) words to their word stem\n",
    "- **word segmentation (tokenization)**: dividing a string of written language into its component words\n",
    "- **stopwords**: a set of commonly used words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:58:17.912972Z",
     "start_time": "2020-05-21T07:58:17.872548Z"
    }
   },
   "outputs": [],
   "source": [
    "import Stemmer\n",
    "import stopwordsiso as stopwordsiso\n",
    "import jieba\n",
    "from pyarabic import araby \n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.tokenize.toktok import ToktokTokenizer\n",
    "from nltk.corpus import stopwords\n",
    "import stopwordsiso\n",
    "\n",
    "class MultiStopword:\n",
    "    \"\"\"Stopword membership tests for every language used in this notebook.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # NLTK supplies the lists for these four languages; keep them as sets.\n",
    "        self.stopwords = {\n",
    "            code: set(stopwords.words(LANGS_MAPPING[code]))\n",
    "            for code in (\"en\", \"es\", \"ar\", \"ru\")\n",
    "        }\n",
    "        # The Chinese list comes from stopwords-iso instead.\n",
    "        self.stopwords[\"zh\"] = stopwordsiso.stopwords(\"zh\")\n",
    "\n",
    "        ## TODO\n",
    "        self.stopwords[NEW_LANG] = set([\"\"])\n",
    "\n",
    "    def is_stopword(self, word,lang):\n",
    "        \"\"\"Return True when `word` is in the stopword list registered for `lang`.\n",
    "\n",
    "        Raises NotImplementedError if no list is registered for `lang`.\n",
    "        \"\"\"\n",
    "        if lang not in self.stopwords:\n",
    "            raise NotImplementedError\n",
    "        return word in self.stopwords[lang]\n",
    "\n",
    "class MultiWordSegmenter:\n",
    "    \"\"\"Split raw text into word tokens with a tokenizer suited to each language.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # Only the Russian tokenizer is an object that has to be built up front.\n",
    "        self.tokenizer = {}\n",
    "        self.tokenizer[\"ru\"] = ToktokTokenizer()\n",
    "\n",
    "    def segment(self, text, lang):\n",
    "        \"\"\"Return the tokens of `text` for the language code `lang`.\n",
    "\n",
    "        Raises NotImplementedError for a language without a tokenizer.\n",
    "        \"\"\"\n",
    "        if lang in [\"en\",\"es\"]:\n",
    "            return word_tokenize(text, language=LANGS_MAPPING[lang])\n",
    "        elif lang == \"zh\":\n",
    "            # jieba.cut yields tokens lazily and can be consumed only once;\n",
    "            # materialise it so callers can iterate over the result repeatedly.\n",
    "            return list(jieba.cut(text))\n",
    "        elif lang == \"ru\":\n",
    "            return self.tokenizer[\"ru\"].tokenize(text)\n",
    "        elif lang == \"ar\":\n",
    "            return araby.tokenize(text)\n",
    "        ## TODO\n",
    "        elif lang == NEW_LANG:\n",
    "            return word_tokenize(text)\n",
    "        else:\n",
    "            raise NotImplementedError\n",
    "\n",
    "class MultiWordStemmers:\n",
    "    \"\"\"Per-language word stemming; languages without a stemmer return the word unchanged.\"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # Language code -> algorithm name understood by Stemmer.Stemmer.\n",
    "        algorithm_names = {\"en\": 'english', \"ar\": 'arabic', \"ru\": 'russian', \"es\": 'spanish'}\n",
    "        self.stemmers = {code: Stemmer.Stemmer(name) for code, name in algorithm_names.items()}\n",
    "\n",
    "    def stem(self, word, lang):\n",
    "        \"\"\"Return the stem of `word` for `lang`.\n",
    "\n",
    "        Raises NotImplementedError for a language that is not handled.\n",
    "        \"\"\"\n",
    "        if lang in self.stemmers:\n",
    "            return self.stemmers[lang].stemWord(word)\n",
    "        if lang == \"zh\":\n",
    "            # No stemmer is registered for Chinese; the token passes through as is.\n",
    "            return word\n",
    "        if lang == NEW_LANG:\n",
    "            ## TODO\n",
    "            return word\n",
    "        raise NotImplementedError\n",
    "\n",
    "stopword_checkers = MultiStopword()\n",
    "word_segmenters = MultiWordSegmenter()\n",
    "stemmers = MultiWordStemmers()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- **Examples:**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-18T15:27:16.550831Z",
     "start_time": "2020-05-18T15:27:16.542494Z"
    }
   },
   "outputs": [],
   "source": [
    "# Compare the stems of three related English word forms.\n",
    "for word in (\"friend\", \"friends\", \"friended\"):\n",
    "    print(stemmers.stem(word, \"en\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-18T15:32:44.028259Z",
     "start_time": "2020-05-18T15:32:44.020115Z"
    }
   },
   "outputs": [],
   "source": [
    "# Russian forms of \"must\" (должен) in each gender and number\n",
    "russian_forms = (\n",
    "    \"должен\",  # masculine\n",
    "    \"должна\",  # feminine\n",
    "    \"должно\",  # neuter\n",
    "    \"должны\",  # plural\n",
    ")\n",
    "for form in russian_forms:\n",
    "    print(stemmers.stem(form, \"ru\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:57:16.517005Z",
     "start_time": "2020-05-21T07:57:16.504323Z"
    }
   },
   "outputs": [],
   "source": [
    "def preprocessing_example(sentence, lang):\n",
    "    \"\"\"Print a preprocessing trace for `sentence`.\n",
    "\n",
    "    The first print shows one `(stem, is_stopword)` tuple per token, one per line;\n",
    "    the second shows the tokens joined by spaces with the stopwords left out.\n",
    "    \"\"\"\n",
    "    # Segment once and keep a list: segment() is not guaranteed to return a\n",
    "    # sequence that can be iterated twice for every language.\n",
    "    tokens = list(word_segmenters.segment(sentence, lang))\n",
    "    # The stopword test runs on the surface form, not on the stem.\n",
    "    flags = [stopword_checkers.is_stopword(tok, lang) for tok in tokens]\n",
    "    print(\"\\n\".join(str((stemmers.stem(tok, lang), flag)) for tok, flag in zip(tokens, flags)))\n",
    "    print(\" \".join(tok for tok, flag in zip(tokens, flags) if not flag))\n",
    "\n",
    "ex_sentence = \"Mr.Brown measured the cat this morning, and it was 14.5 pounds!\"\n",
    "preprocessing_example(ex_sentence, \"en\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:55:30.612149Z",
     "start_time": "2020-05-21T07:55:30.606646Z"
    }
   },
   "source": [
    "# Activity: add stemmers, word_tokenizers, stopwords for your language\n",
    "- find and edit `## TODO`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:11:04.482933Z",
     "start_time": "2020-05-21T07:11:04.474637Z"
    }
   },
   "outputs": [],
   "source": [
    "# and test out!\n",
    "ex_sentence = \"your sentence!\"\n",
    "preprocessing_example(ex_sentence, NEW_LANG)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Activity 2: reduce the number of features (# of unigrams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:11:14.887051Z",
     "start_time": "2020-05-21T07:11:14.570872Z"
    }
   },
   "outputs": [],
   "source": [
    "# Number of distinct unigrams per language before any preprocessing.\n",
    "baseline = {}\n",
    "vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=False)\n",
    "\n",
    "for lang in data_sampled.keys():\n",
    "    sentences_train, y_train = zip(*data_sampled[lang][\"train\"])\n",
    "    vectorizer.fit(sentences_train)\n",
    "    # vocabulary_ maps each learned term to its column index, so its size is the\n",
    "    # feature count; unlike get_feature_names() it exists in every scikit-learn release.\n",
    "    num_unigram = len(vectorizer.vocabulary_)\n",
    "    baseline[lang] = num_unigram\n",
    "    print(lang, num_unigram)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-13T17:53:36.375859Z",
     "start_time": "2020-05-13T17:53:35.511894Z"
    },
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "def preprocess(sentence_list:list, lang:str) -> list:\n",
    "    \"\"\"Run `preprocess_sentence` on each sentence and return the results as a list.\"\"\"\n",
    "    processed = []\n",
    "    for sentence in sentence_list:\n",
    "        processed.append(preprocess_sentence(sentence, lang))\n",
    "    return processed\n",
    "\n",
    "def preprocess_sentence(text:str, lang:str) -> str:\n",
    "    \"\"\"Lowercase `text`, split it on whitespace and re-join the words with single spaces.\n",
    "\n",
    "    `lang` is not used yet; it is there for the language-specific steps of the TODO.\n",
    "    \"\"\"\n",
    "    text = text.lower()\n",
    "    ## TODO\n",
    "    words = text.split()\n",
    "#     words = word_segmenters.segment(text, lang)\n",
    "#     words = [w for w in words if not stopword_checkers.is_stopword(w, lang)]\n",
    "    text = \" \".join(words)\n",
    "    return text\n",
    "\n",
    "# Re-count the unigrams after preprocessing and report the reduction against the baseline.\n",
    "vectorizer = CountVectorizer(ngram_range=(1, 1), lowercase=False)\n",
    "for lang in LANGS:\n",
    "    sentences_train, y_train = zip(*data_sampled[lang][\"train\"])\n",
    "    sentences_train = preprocess(sentences_train, lang)\n",
    "    vectorizer.fit(sentences_train)\n",
    "    # vocabulary_ maps each learned term to its column index, so its size is the\n",
    "    # feature count; unlike get_feature_names() it exists in every scikit-learn release.\n",
    "    num_unigram = len(vectorizer.vocabulary_)\n",
    "    print(f\"{lang}: {num_unigram:<5}({baseline[lang]-num_unigram}\\u2193)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Naive Bayes models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:47:37.366345Z",
     "start_time": "2020-05-21T07:47:37.339195Z"
    }
   },
   "outputs": [],
   "source": [
    "def preprocess(sentence_list:list, lang:str, bool_lowercase=True, bool_segment=True, bool_stem=True, filter_stopwords=False) -> list:\n",
    "    \"\"\"Preprocess every sentence in `sentence_list` with the same switches.\n",
    "\n",
    "    The four boolean switches are forwarded unchanged to `preprocess_sentence`.\n",
    "    \"\"\"\n",
    "    switches = (bool_lowercase, bool_segment, bool_stem, filter_stopwords)\n",
    "    return [preprocess_sentence(sentence, lang, *switches) for sentence in sentence_list]\n",
    "\n",
    "def preprocess_sentence(text:str, lang:str, bool_lowercase, bool_segment, bool_stem, filter_stopwords) -> str:\n",
    "    \"\"\"Preprocess one sentence and return it as a single space-joined string.\n",
    "\n",
    "    bool_lowercase:   lowercase the text first.\n",
    "    bool_segment:     tokenize with the language-specific segmenter instead of str.split().\n",
    "    bool_stem:        replace every token by its stem.\n",
    "    filter_stopwords: drop tokens that are stopwords of `lang`.\n",
    "    \"\"\"\n",
    "    if bool_lowercase:\n",
    "        text = text.lower()\n",
    "\n",
    "    if bool_segment:\n",
    "        words = word_segmenters.segment(text, lang)\n",
    "    else:\n",
    "        words = text.split()\n",
    "\n",
    "    # Filter before stemming: the stopword lists hold surface forms, so a stemmed\n",
    "    # token (e.g. \"becaus\" for \"because\") would no longer match its entry.\n",
    "    if filter_stopwords:\n",
    "        words = [w for w in words if not stopword_checkers.is_stopword(w, lang)]\n",
    "\n",
    "    if bool_stem:\n",
    "        words = [stemmers.stem(w, lang) for w in words]\n",
    "\n",
    "    return \" \".join(words)\n",
    "\n",
    "def train_and_evaluate_nb(data:dict, lang:str, max_feat=100) -> dict:\n",
    "    \"\"\"Train a unigram Multinomial Naive Bayes classifier for `lang` and print its test accuracy.\n",
    "\n",
    "    `data[lang][\"train\"]` and `data[lang][\"test\"]` are sequences of (sentence, label) pairs.\n",
    "    `max_feat` is passed to CountVectorizer as `max_features`.\n",
    "\n",
    "    Returns a dict holding the fitted \"model\" and \"vectorizer\".\n",
    "    \"\"\"\n",
    "    sentences_train, y_train = zip(*data[lang][\"train\"])\n",
    "    sentences_test, y_test = zip(*data[lang][\"test\"])\n",
    "\n",
    "    sentences_train = preprocess(sentences_train, lang)\n",
    "    sentences_test = preprocess(sentences_test, lang)\n",
    "\n",
    "    # The text is already normalised by preprocess(), hence lowercase=False.\n",
    "    # The vocabulary is learned from the training split only and reused for the test split.\n",
    "    vectorizer = CountVectorizer(ngram_range=(1, 1), max_features=max_feat, lowercase=False)\n",
    "    x_train = vectorizer.fit_transform(sentences_train)\n",
    "    x_test = vectorizer.transform(sentences_test)\n",
    "\n",
    "    model = MultinomialNB()\n",
    "    model.fit(x_train, y_train)\n",
    "    acc = model.score(x_test, y_test)\n",
    "    print(f\"{lang}: {acc:.2f}\")\n",
    "    return {\"model\":model, \"vectorizer\":vectorizer}\n",
    "\n",
    "def predict(models, lang, sents):\n",
    "    \"\"\"Print (preprocessed sentence, predicted label) pairs for `sents`.\n",
    "\n",
    "    `models` maps a language code to a dict with the fitted \"model\" and \"vectorizer\";\n",
    "    `sents` may be a single string or a sequence of strings.\n",
    "    \"\"\"\n",
    "    model, vectorizer = models[lang][\"model\"], models[lang][\"vectorizer\"]\n",
    "    # Treat a bare string (or str subclass) as a one-sentence batch.\n",
    "    if isinstance(sents, str):\n",
    "        sents = [sents]\n",
    "    # Apply the same preprocessing that was used on the training sentences.\n",
    "    sents = preprocess(sents, lang)\n",
    "    x = vectorizer.transform(sents)\n",
    "    pred = model.predict(x)\n",
    "    print(list(zip(sents,pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T07:49:13.125301Z",
     "start_time": "2020-05-21T07:49:11.024168Z"
    }
   },
   "outputs": [],
   "source": [
    "models = {}\n",
    "for lang in data.keys():\n",
    "    models[lang] = train_and_evaluate_nb(data, lang)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2020-05-21T08:01:28.589927Z",
     "start_time": "2020-05-21T08:01:28.581344Z"
    }
   },
   "outputs": [],
   "source": [
    "ex_sents = [\"will watch it again\",\"will not watch it again\"]\n",
    "# The vectorizer was fitted on preprocessed sentences, so apply the same\n",
    "# preprocessing before inspecting the feature matrix the model actually sees.\n",
    "print(models[\"en\"][\"vectorizer\"].transform(preprocess(ex_sents, \"en\")))\n",
    "predict(models,\"en\",ex_sents)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Activity 3: Fill out the following table\n",
    "- change boolean arguments in `preprocess()`\n",
    "\n",
    "|                          \t| Ar \t| En \t| Es \t| Ru \t| Zh \t| NEW_LANG \t|\n",
    "|--------------------------\t|----\t|:--:\t|:--:\t|----\t|----\t|----------\t|\n",
    "| Baseline                 \t|    \t|    \t|    \t|    \t|    \t|          \t|\n",
    "| All                      \t|    \t|    \t|    \t|    \t|    \t|          \t|\n",
    "| All - segmentation       \t|    \t|    \t|    \t|    \t|    \t|          \t|\n",
    "| All - stemmer            \t|    \t|    \t|    \t|    \t|    \t|          \t|\n",
    "| All - stopword_filtering \t|    \t|    \t|    \t|    \t|    \t|          \t|"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Activity 4: Explain your own observations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- observation 1:\n",
    "- observation 2:"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:etc]",
   "language": "python",
   "name": "conda-env-etc-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "nbTranslate": {
   "displayLangs": [
    "*"
   ],
   "hotkey": "alt-t",
   "langInMainMenu": true,
   "sourceLang": "en",
   "targetLang": "fr",
   "useGoogleTranslate": true
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
